from pptx import Presentation from pptx.enum.shapes import MSO_SHAPE_TYPE
# Characters removed from both ends of each extracted text. This is exactly the
# set that ``bytes.strip()`` removes, so results match the former
# encode -> strip -> decode round-trip: Unicode-only spaces (e.g. U+3000) are
# deliberately left in place.
_ASCII_WHITESPACE = " \t\n\r\x0b\x0c"


def _iter_group_texts(group_shape):
    """Yield the raw text of each text-bearing shape in *group_shape*, depth-first.

    Groups nested inside *group_shape* are descended into, so text at any
    nesting depth is reported in document order.
    """
    for member in group_shape.shapes:
        # Check for a text frame first: this is the only test the members of a
        # group were subjected to before, so non-nested decks behave the same.
        if member.has_text_frame:
            yield member.text
        elif member.shape_type == MSO_SHAPE_TYPE.GROUP:
            yield from _iter_group_texts(member)


def ppt_catch_format_text(filename):
    """Extract the text of every slide in a PowerPoint file.

    For each slide, the text of the top-level shapes that expose a ``text``
    attribute is collected first, followed by the text of shapes contained in
    group shapes (including groups nested inside other groups). Each text has
    leading and trailing ASCII whitespace removed; empty strings are kept.

    Args:
        filename: Path of the PowerPoint file, passed straight to
            ``pptx.Presentation``.

    Returns:
        A dict mapping the 0-based slide position to a list of strings, one
        string per text-bearing shape on that slide. Slides without any text
        map to an empty list.

    Errors raised by ``Presentation`` while opening the file are not caught
    here and propagate to the caller.
    """
    prs = Presentation(filename)
    slide_texts = {}
    for index, slide in enumerate(prs.slides):
        shapes = slide.shapes

        # Text boxes / auto-shapes sitting directly on the slide.
        texts = [
            shape.text.strip(_ASCII_WHITESPACE)
            for shape in shapes
            if hasattr(shape, "text")
        ]

        # Group shapes carry no text of their own; collect it from their
        # members after the top-level text so the existing ordering is kept.
        for shape in shapes:
            if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
                texts.extend(
                    raw.strip(_ASCII_WHITESPACE)
                    for raw in _iter_group_texts(shape)
                )

        slide_texts[index] = texts
    return slide_texts